In [1]:
import gc

import geopandas as gpd
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from itables import show
from sklearn.cluster import DBSCAN
from tslearn.clustering import TimeSeriesKMeans

%matplotlib inline
%config InlineBackend.figure_formats = ['retina']

pd.set_option("mode.copy_on_write", True)

Sample¶

In [2]:
df = pd.read_parquet("data/nyc/yellow_tripdata_2024-03.parquet")
In [3]:
gdf = gpd.read_file("zip://data/nyc/taxi_zones.zip")
In [4]:
show(gdf, scrollX=True)
OBJECTID Shape_Leng Shape_Area zone LocationID borough geometry
Loading ITables v2.1.0 from the internet... (need help?)

Explore¶

In [5]:
gdf.borough.unique()
Out[5]:
array(['EWR', 'Queens', 'Bronx', 'Manhattan', 'Staten Island', 'Brooklyn'],
      dtype=object)
In [6]:
def plot_borough():
    fig = plt.figure(figsize=(10, 10))
    ax = fig.gca()
    gdf.plot(ax=ax)

    for borough in gdf.borough.unique():
        gdf_subset = gdf[gdf["borough"] == borough]
        centroid = gdf_subset["geometry"].centroid
        centroid_x = centroid.x.mean()
        centroid_y = centroid.y.mean()

        ax.text(
            centroid_x,
            centroid_y,
            borough,
            color="white",
            ha="center",
            va="center",
        )

    plt.show()
In [7]:
plot_borough()
No description has been provided for this image
In [8]:
def plot_zones(borough: str, figsize: tuple):
    fig = plt.figure(figsize=figsize)
    ax = fig.gca()

    if borough == "all":
        gdf_subset = gdf[gdf["borough"] != "Manhattan"]
    else:
        gdf_subset = gdf[gdf["borough"] == borough]

    gdf_subset.plot(ax=ax)

    for t in gdf_subset.itertuples():
        centroid = t.geometry.centroid
        ax.text(
            centroid.x,
            centroid.y,
            t.OBJECTID,
            fontsize=8,
            color="white",
            ha="center",
            va="center",
        )

    plt.show()
In [9]:
plot_zones("all", (12, 12))
plot_zones("Manhattan", (12, 12))
No description has been provided for this image
No description has been provided for this image

Modify¶

Rename¶

In [10]:
df_renamed = df.rename(
    {
        "VendorID": "vendor_id",
        "RatecodeID": "rate_code_id",
        "PULocationID": "pick_up_location_id",
        "DOLocationID": "drop_off_location_id",
        "Airport_fee": "airport_fee",
    },
    axis=1,
)

Remove Duplicates¶

In [11]:
df_deduped = df_renamed.drop_duplicates()
len(df_renamed) - len(df_deduped)
Out[11]:
0

Filter Examples¶

In [12]:
start_date_mask = df_deduped["tpep_pickup_datetime"] >= "2024-03-01"
end_date_mask = df_deduped["tpep_pickup_datetime"] < "2024-04-01"

df_filtered = df_renamed[start_date_mask & end_date_mask]
len(df_deduped) - len(df_filtered)
Out[12]:
23

Integrate Centroids¶

In [13]:
gdf["geometry_x"] = gdf["geometry"].apply(lambda x: x.centroid.x)
gdf["geometry_y"] = gdf["geometry"].apply(lambda x: x.centroid.y)
In [14]:
df_pu = (
    df_filtered.merge(
        right=gdf[["OBJECTID", "geometry_x", "geometry_y"]],
        left_on="pick_up_location_id",
        right_on="OBJECTID",
    )
    .drop("OBJECTID", axis=1)
    .rename(mapper={"geometry_x": "pick_up_x", "geometry_y": "pick_up_y"}, axis=1)
)
In [15]:
df_do = (
    df_pu.merge(
        right=gdf[["OBJECTID", "geometry_x", "geometry_y"]],
        left_on="drop_off_location_id",
        right_on="OBJECTID",
    )
    .drop("OBJECTID", axis=1)
    .rename(mapper={"geometry_x": "drop_off_x", "geometry_y": "drop_off_y"}, axis=1)
)
In [16]:
len(df_filtered) - len(df_do)
Out[16]:
35374
In [17]:
show(df_do.iloc[:300], scrollX=True)
vendor_id tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance rate_code_id store_and_fwd_flag pick_up_location_id drop_off_location_id payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount congestion_surcharge airport_fee pick_up_x pick_up_y drop_off_x drop_off_y
Loading ITables v2.1.0 from the internet... (need help?)
In [18]:
# df_do.to_parquet("data/nyc/yellow_tripdata_2024-03.uc.parquet", compression=None)
In [19]:
del df
del df_renamed
del df_deduped
del df_filtered
del df_pu

gc.collect()
Out[19]:
25263

Model¶

Time-Series Clustering¶

In [20]:
df_ts = df_do.set_index("tpep_pickup_datetime")
n_passengers = df_ts["passenger_count"].resample("h").sum().to_frame()
In [21]:
n_passengers["date"] = n_passengers.index.strftime("%Y-%m-%d")
n_passengers["time"] = n_passengers.index.strftime("%H:%M")

n_passengers_daily = n_passengers.pivot(
    index="date",
    columns="time",
    values="passenger_count",
)

n_passengers_daily["day_name"] = n_passengers_daily.index.map(
    lambda x: pd.to_datetime(x).day_name()
)

n_passengers_daily_inp = n_passengers_daily.iloc[:, :24]

K-Means DTW¶

In [22]:
wcss = []

for i in range(1, 11):
    kmeans_ts = TimeSeriesKMeans(n_clusters=i, metric="dtw", random_state=12345)
    kmeans_ts.fit(n_passengers_daily_inp)
    wcss.append(kmeans_ts.inertia_)
In [23]:
plt.figure(figsize=(6, 3))

plt.plot(range(1, 11), wcss)
plt.ylabel("Within Cluster Sum of Squares")
plt.xlabel("Clusters")
plt.xticks(range(1, 11))

plt.show()
No description has been provided for this image
In [24]:
kmeans_ts = TimeSeriesKMeans(n_clusters=3, metric="dtw", random_state=12345)
n_passengers_daily["kmeans_clusters"] = kmeans_ts.fit_predict(n_passengers_daily_inp)

DBSCAN¶

In [25]:
dbscan_ts = DBSCAN(eps=3500, min_samples=5)
n_passengers_daily["dbscan_clusters"] = dbscan_ts.fit_predict(n_passengers_daily_inp)

Results¶

In [26]:
show(n_passengers_daily, scrollX=True)
time 00:00 01:00 02:00 03:00 04:00 05:00 06:00 07:00 08:00 09:00 10:00 11:00 12:00 13:00 14:00 15:00 16:00 17:00 18:00 19:00 20:00 21:00 22:00 23:00 day_name kmeans_clusters dbscan_clusters
date
Loading ITables v2.1.0 from the internet... (need help?)
In [27]:
def plot_clusters_ts(algorithm: str):
    col_name = f"{algorithm}_clusters"

    plt.figure(figsize=(10, 6))
    colors = plt.cm.tab10(np.arange(10))
    anomalous_idx = n_passengers_daily[col_name].max() + 1

    for i, day in enumerate(n_passengers_daily.index):
        cluster = n_passengers_daily[col_name].iloc[i]

        if cluster == -1:
            color = anomalous_idx
            label = "Noises"
        else:
            label = f"Cluster {cluster}"
            color = cluster

        plt.plot(
            n_passengers_daily.columns[:24],
            n_passengers_daily.iloc[:, :24].loc[day],
            color=colors[color],
            label=label,
        )

    handles, labels = plt.gca().get_legend_handles_labels()
    unique_labels = list(set(labels))
    unique_handles = [handles[labels.index(label)] for label in unique_labels]

    plt.ylabel("Passenger Count")
    plt.xlabel("Hours (March 2024)")
    plt.xticks(rotation=45)

    handles, labels = plt.gca().get_legend_handles_labels()
    unique_labels = sorted(list(set(labels)))
    unique_handles = [handles[labels.index(label)] for label in unique_labels]
    plt.legend(unique_handles, unique_labels)

    plt.show()
In [28]:
plot_clusters_ts(algorithm="kmeans")
No description has been provided for this image
In [29]:
plot_clusters_ts(algorithm="dbscan")
No description has been provided for this image
In [30]:
def tabulate_clusters_ts(algorithm: str):
    n_passengers_daily_agg = (
        n_passengers_daily.reset_index()
        .groupby(f"{algorithm}_clusters")
        .agg({"date": list, "day_name": lambda x: list(set(x))})
    )

    show(n_passengers_daily_agg, scrollX=True)
In [31]:
tabulate_clusters_ts(algorithm="kmeans")
time date day_name
kmeans_clusters
Loading ITables v2.1.0 from the internet... (need help?)
In [32]:
tabulate_clusters_ts(algorithm="dbscan")
time date day_name
dbscan_clusters
Loading ITables v2.1.0 from the internet... (need help?)

Anomaly Detection in Time Series¶

In [33]:
df_tp = df_do.set_index("tpep_pickup_datetime")
n_passengers_min = df_tp["passenger_count"].resample("min").sum().to_frame()
n_passengers_min["day_name"] = n_passengers_min.index.map(lambda x: x.day_name())

DBSCAN¶

In [34]:
dbscan_tp = DBSCAN(eps=1, min_samples=50)
n_passengers_min["clusters"] = dbscan_tp.fit_predict(n_passengers_min.iloc[:, [0]])

Results¶

In [35]:
n_passengers_min_noises = n_passengers_min[n_passengers_min["clusters"] == -1]
show(n_passengers_min_noises, scrollX=True)
passenger_count day_name clusters
tpep_pickup_datetime
Loading ITables v2.1.0 from the internet... (need help?)
In [36]:
plt.figure(figsize=(12, 8))

for cluster in n_passengers_min["clusters"].unique():
    cluster_df = n_passengers_min[n_passengers_min["clusters"] == cluster]

    if cluster == -1:
        label = "Anomalous"
    else:
        label = "Normal Points"

    plt.scatter(cluster_df.index, cluster_df["passenger_count"], label=label)

plt.ylabel("Passenger Count")
plt.xlabel("Days (March 2024)")
plt.xticks(pd.date_range("2024-03-01", "2024-04-01"), rotation=45)
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%m-%d"))
plt.legend()

plt.show()
No description has been provided for this image
In [37]:
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
days_ord = {day: i for i, day in enumerate(days)}

n_passengers_min_noises_days = n_passengers_min_noises.groupby("day_name").size()
n_passengers_min_noises_days = n_passengers_min_noises_days.to_frame()
n_passengers_min_noises_days["order"] = n_passengers_min_noises_days.index.map(days_ord)
n_passengers_min_noises_days.sort_values(by="order", inplace=True)
show(n_passengers_min_noises_days, scrollX=True)
0 order
day_name
Loading ITables v2.1.0 from the internet... (need help?)
In [38]:
plt.figure(figsize=(6, 3))

plt.bar(n_passengers_min_noises_days.index, n_passengers_min_noises_days[0])
plt.ylabel("Count")
plt.xlabel("Days of the Week")
plt.show()
No description has been provided for this image
In [39]:
hours = n_passengers_min_noises.index.hour
n_passengers_min_noises_hours = n_passengers_min_noises.groupby(hours).size()

show(n_passengers_min_noises_hours, scrollX=True)
0
tpep_pickup_datetime
Loading ITables v2.1.0 from the internet... (need help?)
In [40]:
plt.figure(figsize=(6, 3))

plt.bar(n_passengers_min_noises_hours.index, n_passengers_min_noises_hours)
plt.ylabel("Count")
plt.xlabel("Hours of a Day")
plt.xticks(n_passengers_min_noises_hours.index)
plt.show()
No description has been provided for this image

Hotspot Detection¶

In [41]:
n_trips_pu = df_do[df_do["tpep_pickup_datetime"].dt.hour == 18]
In [42]:
def stratified_sample_per_day(group, proportion=0.2):
    n_samples = max(1, int(len(group) * proportion))

    return group.sample(n=n_samples, random_state=12345)
In [43]:
n_trips_pu_sampled = (
    n_trips_pu.groupby(n_trips_pu["tpep_pickup_datetime"].dt.date)
    .apply(stratified_sample_per_day)
    .reset_index(drop=True)
)

n_trips_pu_inp = n_trips_pu_sampled[["pick_up_x", "pick_up_y"]]

DBSCAN¶

In [44]:
dbscan_pu = DBSCAN(eps=3850, min_samples=50)
n_trips_pu_sampled["clusters_pu"] = dbscan_pu.fit_predict(n_trips_pu_inp)

Results¶

In [45]:
show(n_trips_pu_sampled.iloc[:300], scrollX=True)
vendor_id tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance rate_code_id store_and_fwd_flag pick_up_location_id drop_off_location_id payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount congestion_surcharge airport_fee pick_up_x pick_up_y drop_off_x drop_off_y clusters_pu
Loading ITables v2.1.0 from the internet... (need help?)
In [46]:
fig = plt.figure(figsize=(10, 10))
ax = fig.gca()
anomalous_idx = n_trips_pu_sampled["clusters_pu"].max() + 1

gdf.plot(ax=ax, alpha=0.1)

for cluster in np.sort(n_trips_pu_sampled["clusters_pu"].unique()):
    if cluster == -1:
        label = "Noises"
    else:
        label = f"Cluster {cluster}"

    points = n_trips_pu_sampled[n_trips_pu_sampled["clusters_pu"] == cluster]
    ax.scatter(points["pick_up_x"], points["pick_up_y"], label=label, marker="x")

plt.legend()
plt.show()
No description has been provided for this image
In [47]:
n_trips_clusters = n_trips_pu_sampled["clusters_pu"].value_counts().sort_index()
show(n_trips_clusters, scrollX=True)
count
clusters_pu
Loading ITables v2.1.0 from the internet... (need help?)
In [48]:
plt.figure(figsize=(6, 3))

plt.bar(n_trips_clusters.index, n_trips_clusters)
plt.ylabel("Number of Trips")
plt.yscale("log")
plt.xlabel("Clusters")
plt.xticks(n_trips_clusters.index)

plt.show()
No description has been provided for this image
In [ ]: